#https://datatables.net/reference/option/
options(DT.options = list(scrollX = TRUE, pagin=TRUE, fixedHeader = TRUE, searchHighlight = TRUE))

Introduction

Check out this Kaggle

Business Goals

  1. Predict the number of adults for the resort and city hotels for the next 4 weeks
    • Note: one booking can be for multiple adults, so aggregate accordingly
  2. Classify a booking as a resort or city type
  3. Predict average daily rate (adr) prices based on other features

Ideas

  1. fb’s prophet package
  2. logistic regression / random forests / svm
  3. elastic net, random forest, regression

Get Data

a = read_csv('hotel_bookings.csv') %>%
  clean_names() %>% 
  mutate(across(where(is.character), factor)) %>% 
  select(sort(tidyselect::peek_vars())) %>% 
  select(
    where(is.Date),
    where(is.factor),
    where(is.numeric)
  ) %>% filter(is_canceled == 0) #filter to non-canceled bookings

a$is_canceled = NULL

Split Data

set.seed(345)
split = initial_split(a, strata = hotel)
train = rsample::training(split)
test = rsample::testing(split)

Resources

  1. adr
  2. transient
  3. group rate
  4. Guide to Hotel Management
  5. distribution channels

5 min EDA

train %>% head

Not sure exactly what ‘reservation_status_date’ refers to; for a date field, will create an ‘arrival.date’ col to use in its place

skimr::skim(train)
Data summary
Name train
Number of rows 56375
Number of columns 31
_______________________
Column type frequency:
Date 1
factor 13
numeric 17
________________________
Group variables None

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
reservation_status_date 0 1 2015-07-01 2017-09-14 2016-09-02 804

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
agent 0 1 FALSE 300 9: 14052, NUL: 9261, 240: 6312, 7: 2273
arrival_date_month 0 1 FALSE 12 Aug: 6441, Jul: 5884, May: 5370, Oct: 5179
assigned_room_type 0 1 FALSE 10 A: 30838, D: 14175, E: 4385, F: 2144
company 0 1 FALSE 308 NUL: 52146, 40: 632, 223: 499, 45: 175
country 0 1 FALSE 155 PRT: 15765, GBR: 7211, FRA: 6412, ESP: 4768
customer_type 0 1 FALSE 4 Tra: 39766, Tra: 14099, Con: 2114, Gro: 396
deposit_type 0 1 FALSE 3 No : 56210, Ref: 100, Non: 65
distribution_channel 0 1 FALSE 5 TA/: 43282, Dir: 9068, Cor: 3905, GDS: 119
hotel 0 1 FALSE 2 Cit: 34671, Res: 21704
market_segment 0 1 FALSE 7 Onl: 26811, Off: 11860, Dir: 7962, Gro: 5842
meal 0 1 FALSE 5 BB: 43365, HB: 7065, SC: 5036, Und: 670
reservation_status 0 1 FALSE 1 Che: 56375, Can: 0, No-: 0
reserved_room_type 0 1 FALSE 9 A: 39280, D: 9765, E: 3508, F: 1534

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
adr 0 1 100.02 49.30 -6.38 67.5 92.28 125 510 <U+2587><U+2586><U+2581><U+2581><U+2581>
adults 0 1 1.83 0.51 0.00 2.0 2.00 2 4 <U+2581><U+2582><U+2587><U+2581><U+2581>
arrival_date_day_of_month 0 1 15.86 8.78 1.00 8.0 16.00 23 31 <U+2587><U+2587><U+2587><U+2587><U+2586>
arrival_date_week_number 0 1 27.10 13.94 1.00 16.0 28.00 38 53 <U+2586><U+2587><U+2587><U+2587><U+2586>
arrival_date_year 0 1 2016.15 0.70 2015.00 2016.0 2016.00 2017 2017 <U+2583><U+2581><U+2587><U+2581><U+2586>
babies 0 1 0.01 0.12 0.00 0.0 0.00 0 10 <U+2587><U+2581><U+2581><U+2581><U+2581>
booking_changes 0 1 0.30 0.74 0.00 0.0 0.00 0 21 <U+2587><U+2581><U+2581><U+2581><U+2581>
children 0 1 0.10 0.39 0.00 0.0 0.00 0 3 <U+2587><U+2581><U+2581><U+2581><U+2581>
days_in_waiting_list 0 1 1.60 14.89 0.00 0.0 0.00 0 379 <U+2587><U+2581><U+2581><U+2581><U+2581>
is_repeated_guest 0 1 0.04 0.20 0.00 0.0 0.00 0 1 <U+2587><U+2581><U+2581><U+2581><U+2581>
lead_time 0 1 79.72 90.84 0.00 9.0 45.00 123 709 <U+2587><U+2582><U+2581><U+2581><U+2581>
previous_bookings_not_canceled 0 1 0.20 1.84 0.00 0.0 0.00 0 72 <U+2587><U+2581><U+2581><U+2581><U+2581>
previous_cancellations 0 1 0.02 0.26 0.00 0.0 0.00 0 13 <U+2587><U+2581><U+2581><U+2581><U+2581>
required_car_parking_spaces 0 1 0.10 0.30 0.00 0.0 0.00 0 8 <U+2587><U+2581><U+2581><U+2581><U+2581>
stays_in_week_nights 0 1 2.46 1.92 0.00 1.0 2.00 3 41 <U+2587><U+2581><U+2581><U+2581><U+2581>
stays_in_weekend_nights 0 1 0.93 0.99 0.00 0.0 1.00 2 16 <U+2587><U+2581><U+2581><U+2581><U+2581>
total_of_special_requests 0 1 0.72 0.84 0.00 0.0 1.00 1 5 <U+2587><U+2581><U+2581><U+2581><U+2581>

clean/encode data

# make arrival date var
train = train %>% mutate(
  arrival.date = make_date(
    year = arrival_date_year,
    month = match(arrival_date_month, month.name),
    day = arrival_date_day_of_month)
  ) %>% arrange(arrival.date)

### will use this for later on
train.anomaly = train

# these numeric vars s/b factor vars
train = train %>% mutate_at(vars(arrival_date_day_of_month, arrival_date_week_number, arrival_date_year, is_repeated_guest), factor)

# reordering df
train = train %>% select(sort(tidyselect::peek_vars())) %>% 
  select(
    where(is.Date),
    where(is.factor),
    where(is.numeric)
  )

You’ll have to repeat the steps above for the test data when preprocessing with recipes and step_XXX’

EDA: time series

range

paste(
  'The date range of this dataset is from',
  train %>% pull(arrival.date) %>% range %>% .[1],
  'to',
  train %>% pull(arrival.date) %>% range %>% .[2],
  ', just over 3 years of data.'
)
## [1] "The date range of this dataset is from 2015-07-01 to 2017-08-31 , just over 3 years of data."

time series graph – ungrouped

train %>% group_by(arrival.date) %>% 
  summarise(total.individuals = sum(adults, children)) %>% 
  arrange(arrival.date) %>%
  plot_ly(
    x = ~arrival.date,
    y = ~total.individuals
  ) %>% layout(
    title = 'total.individuals by date',
    xaxis = list(title = ''),
    yaxis = list(title = '')
    )

time series graph – grouped (hotel)

train %>% group_by(arrival.date, hotel) %>% 
  summarise(total.individuals = sum(adults, children)) %>% 
  arrange(arrival.date) %>%
  plot_ly(
    x = ~arrival.date,
    y = ~total.individuals,
    color = ~hotel,
    alphtrain = 0.7
  )  %>% layout(
    title = 'total.individuals by date/hotel',
    xaxis = list(title = ''),
    yaxis = list(title = '')
    ) 

time series graph – grouped (customer_type)

train %>% group_by(arrival.date, customer_type) %>% 
  summarise(total.individuals = sum(adults, children)) %>%  
  arrange(arrival.date) %>%
  plot_ly(
    x = ~arrival.date,
    y = ~total.individuals,
    color = ~customer_type,
    alphtrain = 0.7
  ) %>% layout(
    title = 'total.individuals by date/customer_type',
    xaxis = list(title = ''),
    yaxis = list(title = '')
    )

time series graph – grouped (deposit_type)

train %>% group_by(arrival.date, deposit_type) %>% 
  summarise(total.individuals = sum(adults, children)) %>%   
  arrange(arrival.date) %>%
  plot_ly(
    x = ~arrival.date,
    y = ~total.individuals,
    color = ~deposit_type,
    alphtrain = 0.7
  ) %>% layout(
    title = 'total.individuals by date/deposit_type',
    xaxis = list(title = ''),
    yaxis = list(title = '')
    )

time series graph – grouped (distribution_channel)

train %>% group_by(arrival.date, distribution_channel) %>% 
  summarise(total.individuals = sum(adults, children)) %>%  
  arrange(arrival.date) %>%
  plot_ly(
    x = ~arrival.date,
    y = ~total.individuals,
    color = ~distribution_channel,
    alphtrain = 0.7
  ) %>% layout(
    title = 'total.individuals by date/distribution_channel',
    xaxis = list(title = ''),
    yaxis = list(title = '')
    )

EDA: nom vars

sample data

train %>% select(where(is.factor)) %>% slice_sample(n = 10)

glimpse structure

train %>% select(where(is.factor)) %>% glimpse
## Rows: 56,375
## Columns: 17
## $ agent                     <fct> NULL, NULL, 240, 240, 240, 240, 241, 8, 2...
## $ arrival_date_day_of_month <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ arrival_date_month        <fct> July, July, July, July, July, July, July,...
## $ arrival_date_week_number  <fct> 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 2...
## $ arrival_date_year         <fct> 2015, 2015, 2015, 2015, 2015, 2015, 2015,...
## $ assigned_room_type        <fct> C, C, A, A, D, E, G, E, E, G, F, A, A, D,...
## $ company                   <fct> NULL, NULL, NULL, NULL, NULL, NULL, NULL,...
## $ country                   <fct> PRT, GBR, GBR, GBR, PRT, USA, ESP, PRT, I...
## $ customer_type             <fct> Transient, Transient, Transient, Transien...
## $ deposit_type              <fct> No Deposit, No Deposit, No Deposit, No De...
## $ distribution_channel      <fct> Direct, Direct, TA/TO, TA/TO, TA/TO, TA/T...
## $ hotel                     <fct> Resort Hotel, Resort Hotel, Resort Hotel,...
## $ is_repeated_guest         <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ market_segment            <fct> Direct, Direct, Online TA, Online TA, Onl...
## $ meal                      <fct> BB, BB, BB, BB, HB, BB, HB, BB, BB, BB, B...
## $ reservation_status        <fct> Check-Out, Check-Out, Check-Out, Check-Ou...
## $ reserved_room_type        <fct> C, A, A, A, D, D, G, E, A, A, F, A, A, D,...

check missing values

train %>% select(where(is.factor)) %>% miss_var_summary

distribution of level counts per factor

jpal = colorRampPalette(brewer.pal(8,'Dark2'))(15)

train %>% select(where(is.factor)) %>%
  map(n_unique) %>%
  as.tibble() %>%
  pivot_longer(everything()) %>%
  plot_ly(y = ~name, x = ~value, color = ~name, colors = jpal) %>%
  add_bars() %>%
  hide_legend() %>% 
  layout(
    title = 'distribution of level counts per factor',
    xaxis = list(title = ''),
    yaxis = list(title = '')
    )

reference: names of unique levels

train %>% select(where(is.factor)) %>%
  map(unique)
## $agent
##   [1] NULL 240  303  241  8    250  115  134  156  243  242  5    40   105  147 
##  [16] 306  184  175  96   2    127  95   6    244  15   167  300  305  196  152 
##  [31] 142  171  177  36   67   104  261  149  26   258  71   146  181  88   143 
##  [46] 251  275  69   256  314  110  126  281  208  253  330  328  326  334  185 
##  [61] 321  324  313  38   155  68   308  332  387  298  273  248  315  75   307 
##  [76] 436  201  183  223  94   3    16   468  446  327  34   78   385  339  139 
##  [91] 9    270  47   128  154  114  66   29   301  245  193  1    182  336  135 
## [106] 348  350  195  352  355  159  10   384  363  360  375  331  367  91   64  
## [121] 393  168  406  249  405  414  333  11   427  431  429  426  430  438  418 
## [136] 441  282  432  72   450  434  454  455  59   368  254  57   180  358  464 
## [151] 411  481  165  467  510  531  440  337  526  493  420  502  527  469  410 
## [166] 479  508  163  535  302  13   7    27   22   17   28   14   42   20   19  
## [181] 45   37   61   39   21   24   50   30   52   12   44   31   83   32   63  
## [196] 56   89   86   79   132  4    70   82   81   74   92   99   85   87   112 
## [211] 106  98   111  119  148  151  138  103  121  118  211  153  210  187  129 
## [226] 213  174  220  173  216  232  35   23   58   205  157  133  150  214  290 
## [241] 192  191  267  215  252  247  219  280  285  289  269  295  265  335  288 
## [256] 122  325  234  341  310  344  77   346  359  283  364  370  33   371  25  
## [271] 53   227  141  378  391  397  416  404  73   354  444  296  425  461  394 
## [286] 390  388  453  262  459  474  229  475  480  423  484  495  509  449 
## 334 Levels: 1 10 103 104 105 106 107 11 110 111 112 114 115 117 118 119 ... NULL
## 
## $arrival_date_day_of_month
##  [1] 1  2  3  4  5  6  7  8  9  10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## [26] 26 27 28 29 30 31
## 31 Levels: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 ... 31
## 
## $arrival_date_month
##  [1] July      August    September October   November  December  January  
##  [8] February  March     April     May       June     
## 12 Levels: April August December February January July June March ... September
## 
## $arrival_date_week_number
##  [1] 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51
## [26] 52 53 1  2  3  4  5  6  7  8  9  10 11 12 13 14 15 16 17 18 19 20 21 22 23
## [51] 24 25 26
## 53 Levels: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 ... 53
## 
## $arrival_date_year
## [1] 2015 2016 2017
## Levels: 2015 2016 2017
## 
## $assigned_room_type
##  [1] C A G E F D I B H K
## Levels: A B C D E F G H I K L P
## 
## $company
##   [1] NULL 110  113  270  240  154  144  307  268  59   312  318  274  174  195 
##  [16] 223  317  118  281  47   324  342  371  286  88   331  178  405  367  337 
##  [31] 20   94   53   528  62   120  42   82   81   116  530  103  112  135  9   
##  [46] 39   204  16   92   31   61   356  12   457  501  86   165  291  292  290 
##  [61] 192  108  51   43   34   224  388  269  465  287  297  490  482  207  169 
##  [76] 282  437  263  225  329  28   200  338  72   319  146  159  380  323  407 
##  [91] 421  325  80   403  399  14   84   343  346  347  349  289  351  355  353 
## [106] 99   250  358  361  390  362  366  365  277  109  377  379  22   378  83  
## [121] 364  360  401  232  511  384  167  212  514  391  400  376  392  396  302 
## [136] 370  397  369  409  251  168  104  382  408  413  148  10   333  419  415 
## [151] 424  425  423  428  422  395  435  439  445  448  443  454  444  394  52  
## [166] 459  458  456  460  447  470  255  466  484  184  485  32   487  491  494 
## [181] 193  516  499  308  29   78   254  504  130  520  507  498  515  512  126 
## [196] 64   330  242  477  518  521  523  539  534  436  525  541  40   455  45  
## [211] 38   49   67   68   8    46   76   96   115  105  93   101  11   91   137 
## [226] 139  142  127  107  140  143  149  150  163  85   180  238  219  179  221 
## [241] 183  153  197  203  185  217  209  215  213  186  237  216  230  234  227 
## [256] 246  245  218  158  258  259  260  411  272  257  271  18   275  210  273 
## [271] 278  71   284  301  233  305  293  264  311  304  313  288  320  334  314 
## [286] 332  341  352  350  372  73   383  368  393  220  412  420  410  426  417 
## [301] 243  429  433  446  450  418  280  357  479  478  483  489  486  481  497 
## [316] 451  492 
## 353 Levels: 10 100 101 102 103 104 105 106 107 108 109 11 110 112 113 ... NULL
## 
## $country
##   [1] PRT  GBR  ESP  IRL  FRA  NULL ROU  NOR  ARG  USA  POL  DEU  CHE  CN   GRC 
##  [16] ITA  DNK  RUS  SWE  EST  CZE  FIN  AUS  MOZ  BWA  NLD  ALB  BRA  BEL  IND 
##  [31] CHN  MAR  SVN  UKR  SMR  LVA  SRB  CHL  AUT  BLR  LTU  OMN  TUR  MEX  ISR 
##  [46] LUX  ZMB  CPV  ZWE  DZA  KOR  CRI  HUN  HRV  CAF  CYP  NZL  IDN  THA  DOM 
##  [61] NGA  GIB  ARM  LKA  CUB  COL  ZAF  CMR  IRN  BIH  MUS  COM  JAM  UGA  BGR 
##  [76] CIV  JOR  PRI  JPN  MYS  AGO  URY  SAU  SGP  KWT  LBN  AND  VNM  PLW  ARE 
##  [91] QAT  EGY  SVK  SUR  MDV  MLT  MWI  ISL  UZB  ECU  NPL  KAZ  BHS  PAK  MAC 
## [106] TGO  TWN  HKG  DJI  PHL  GEO  TUN  STP  VEN  AZE  SEN  PER  KNA  ETH  IRQ 
## [121] RWA  MMR  TMP  BFA  KEN  MCO  LBY  PAN  GNB  TZA  BGD  BHR  NAM  BOL  SYC 
## [136] BRB  AIA  SLV  PYF  LIE  ATA  GAB  MKD  MNE  GTM  GHA  ASM  PRY  SYR  ABW 
## [151] NCL  ATF  SLE  LAO  FRO 
## 178 Levels: ABW AGO AIA ALB AND ARE ARG ARM ASM ATA ATF AUS AUT AZE BDI ... ZWE
## 
## $customer_type
## [1] Transient       Contract        Transient-Party Group          
## Levels: Contract Group Transient Transient-Party
## 
## $deposit_type
## [1] No Deposit Refundable Non Refund
## Levels: No Deposit Non Refund Refundable
## 
## $distribution_channel
## [1] Direct    TA/TO     Corporate Undefined GDS      
## Levels: Corporate Direct GDS TA/TO Undefined
## 
## $hotel
## [1] Resort Hotel City Hotel  
## Levels: City Hotel Resort Hotel
## 
## $is_repeated_guest
## [1] 0 1
## Levels: 0 1
## 
## $market_segment
## [1] Direct        Online TA     Offline TA/TO Corporate     Complementary
## [6] Groups        Aviation     
## 8 Levels: Aviation Complementary Corporate Direct Groups ... Undefined
## 
## $meal
## [1] BB        FB        HB        SC        Undefined
## Levels: BB FB HB SC Undefined
## 
## $reservation_status
## [1] Check-Out
## Levels: Canceled Check-Out No-Show
## 
## $reserved_room_type
## [1] C A G E D F H L B
## Levels: A B C D E F G H L P
train = train %>% mutate(arrival_date_month = factor(arrival_date_month, levels = c('January','February','March','April','May','June','July','August','September','October','November','December')))

EDA: num vars

check missing values

train %>% select(where(is.numeric)) %>% miss_var_summary

sample data

train %>% select(where(is.numeric)) %>% slice_sample(n = 10)

glimpse structure

train %>% select(where(is.numeric)) %>% glimpse
## Rows: 56,375
## Columns: 13
## $ adr                            <dbl> 0.00, 0.00, 75.00, 98.00, 98.00, 107...
## $ adults                         <dbl> 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...
## $ babies                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ booking_changes                <dbl> 3, 4, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...
## $ children                       <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...
## $ days_in_waiting_list           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ lead_time                      <dbl> 342, 737, 7, 14, 14, 0, 9, 18, 37, 6...
## $ previous_bookings_not_canceled <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ previous_cancellations         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ required_car_parking_spaces    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ stays_in_week_nights           <dbl> 0, 0, 1, 2, 2, 2, 2, 4, 4, 4, 4, 1, ...
## $ stays_in_weekend_nights        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ total_of_special_requests      <dbl> 0, 0, 0, 1, 1, 0, 1, 1, 0, 3, 0, 0, ...

viz: outliers

train %>% select(where(is.numeric)) %>% dlookr::plot_outlier()

There are many upper outliers. When building a prediction model, perhaps we should consider transforming them or removing them entirely.

check right tailness

jquantiles = function(col){quantile(col, probs = c(0.90, 0.95, 0.99, 1))}

train %>% na.omit %>% select(where(is.numeric)) %>%
  map(.x = . , jquantiles) %>%
  as.data.frame.list() %>%
  rownames_to_column() %>%
  as.tibble()

viz: normality

train %>% select(where(is.numeric)) %>% dlookr::plot_normality()

viz: distribution histogram (with outliers)

#with outliers
train %>% select(where(is.numeric)) %>% DataExplorer::plot_histogram(nrow = 2, ncol = 1)

viz: distribution histogram (without most extreme outliers)

#no extreme right tailed outliers
train %>% select(where(is.numeric)) %>% filter(
  adr != 510,
  adults != 55,
  babies != 10,
  booking_changes != 21,
  children != 10,
  days_in_waiting_list != 391,
  lead_time != 709,
  previous_bookings_not_canceled != 72,
  previous_cancellations != 26,
  required_car_parking_spaces != 8,
  stays_in_week_nights != 50,
  stays_in_weekend_nights != 19
) %>% DataExplorer::plot_histogram(nrow = 2, ncol = 1)

viz: distribution bivariate (without extreme outliers)

#no outliers
train %>% select(hotel, where(is.numeric)) %>% filter(
  adr != 510,
  adults != 55,
  babies != 10,
  booking_changes != 21,
  children != 10,
  days_in_waiting_list != 391,
  lead_time != 709,
  previous_bookings_not_canceled != 72,
  previous_cancellations != 26,
  required_car_parking_spaces != 8,
  stays_in_week_nights != 50,
  stays_in_weekend_nights != 19
) %>% DataExplorer::plot_boxplot(by = 'hotel', nrow = 3, ncol = 1)

EDA: ADR further investigation

CAUTION: You should calculate ADR per adult since the ds is at the booking level not the individual level. Also you should remove bookings with children

#check: check number of bookings by combination of adult and children counts
train %>% count(adults, children)
train %>% count(adults, children) %>% filter(adults == 0) %>% summarise(total.bookings.without.adults = sum(n))
train %>% count(adults, children) %>% filter(children == 0) %>% summarise(total.bookings.without.children = sum(n))
train %>% count(adults, children) %>% filter(children == 0, adults == 0) %>% summarise(total.bookings.without.adults.and.children = sum(n))

There are very few bookings without adults, but it’s good to filter out to get the most accurate results.’

percent of ds with only adult booking data

paste(
  scales::percent(
    nrow(train %>% filter(adults > 0, children == 0)) / nrow(train)
    ),
  'of all observations have 1 or more adults and zero children.'
)
## [1] "93% of all observations have 1 or more adults and zero children."
train.adult.adrs = train %>% filter(adults > 0, children == 0)

distribution ADR per child

train %>% filter(adults == 0, children >0) %>% mutate(adr.per.child = adr/children) %>% plot_ly(x = ~adr.per.child)  %>% add_boxplot()

Median ADR per child is $43.54

create adr.per.adult var

#creating adr per adult var
train.adult.adrs = train.adult.adrs %>% mutate(adr.per.adult = adr/adults)

train.adult.adrs %>%
  plot_ly(y = ~hotel, x = ~adr.per.adult, color = ~hotel, colors = jpal[1:2]) %>% 
  add_boxplot() %>% 
  layout(
    title = 'ADR per Adult by Hotel Type',
    xaxis = list(title = ''),
    yaxis = list(title = '')
  )
#https://stackoverflow.com/questions/57300053/split-a-plotly-boxplot-x-axis-by-group
train.adult.adrs %>%
  plot_ly(y = ~hotel, x = ~adr.per.adult, color = ~customer_type, colors = jpal, group = ~customer_type) %>% 
  add_boxplot() %>% 
  layout(
    boxmode = 'group', #SUPER IMPORTANT
    title = 'ADR per Adult by Hotel/customer_type'
    ) 
#https://stackoverflow.com/questions/57300053/split-a-plotly-boxplot-x-axis-by-group
train.adult.adrs %>%
  plot_ly(y = ~hotel, x = ~adr, color = ~market_segment, colors = jpal, group = ~market_segment) %>% 
  add_boxplot() %>% 
  layout(
    boxmode = 'group', #SUPER IMPORTANT
    title = 'ADR per Adult by Hotel/market_segment'
    ) 
#https://stackoverflow.com/questions/57300053/split-a-plotly-boxplot-x-axis-by-group
train.adult.adrs %>%
  plot_ly(x = ~hotel, y = ~adr, color = ~arrival_date_month, colors = jpal, group = ~arrival_date_month) %>% 
  add_boxplot() %>% 
  layout(
    boxmode = 'group', #SUPER IMPORTANT
    title = 'ADR per Adult by Hotel/arrival_date_month',
    hoverformat = '.0f'
    ) 
library(DescTools)
## 
## Attaching package: 'DescTools'
## The following object is masked from 'package:data.table':
## 
##     %like%

Observations

  1. On average, the Resort Hotel ($40) is cheaper than the City Hotel ($54)
  2. Not surprisingly, ‘Direct’ market segment customers pay more, while Group and Corporate customers pay less
  3. Not surprisingly, ADR rates are higher for the Resort Hotel in summer months

correlations: viz

train %>% select(where(is.numeric)) %>% dlookr::plot_correlate()

train %>% select(where(is.numeric)) %>% GGally::ggcorr(palette = "RdBu", label = TRUE)

Goal 1: Predict Number of Adults by Hotel Type

Aggregating at the weekly level to smooth out daily fluctuations

creating time series dataset

creating a separate time series ds for anomaly detection and predictions b/c time series ds needs to be split chronologically via ‘initial_time_split’, not randomly via ‘initial_split’

create ts dataset, filter, and split

#times series dataset
ts =  a %>% na.omit %>% filter(
  adults > 0,
  children == 0
) %>% mutate(
  arrival.date = lubridate::make_date(
      year = arrival_date_year,
      month = match(arrival_date_month, month.name),
      day = arrival_date_day_of_month
  )) %>% mutate(
    arrival.week = floor_date(arrival.date, 'week', week_start = 7)
  ) %>% group_by(arrival.week, hotel) %>% 
  summarise(total.adult.bookings = sum(adults), .groups = 'drop') %>% 
  arrange(arrival.week)
  
#---------------------------- city

ts.city = ts %>% filter(hotel == 'City Hotel')
ts.city$hotel = NULL
ts.city.split = initial_time_split(ts.city)
ts.city.train = training(ts.city.split)
ts.city.test = testing(ts.city.split)

#---------------------------- resort

ts.resort = ts %>% filter(hotel == 'Resort Hotel')
ts.resort$hotel = NULL
ts.resort.split = initial_time_split(ts.resort)
ts.resort.train = training(ts.resort.split)
ts.resort.test = testing(ts.resort.split)

Anomaly Detection

library(anomalize)
# time_decompose(data, target, method = c("stl", "twitter"), frequency = "auto", trend = "auto", ..., merge = FALSE, message = TRUE)
# anomalize(data, target, method = c("iqr", "gesd"), alpha = 0.05, max_anoms = 0.2, verbose = FALSE)
# The alpha parameter adjusts the width of the critical values. By default, alpha = 0.05.
# Lower values are more conservative while higher values are less prone to incorrectly classifying "normal" observations.
# max_anoms: The maximum percent of anomalies permitted to be identified.

# The STL method uses the stl() function from the stats package. STL works very well in circumstances where a long term trend is present (which applies in this case; see trend component in the prophet graphs below'). 
  
(ts.city.train %>% as.tibble() %>% 
  time_decompose(total.adult.bookings, method = 'stl', merge = TRUE) %>%
  anomalize(remainder, alpha = 0.15, method = 'gesd') %>% #increasing sensitivity to outliers
  time_recompose())
ggplotly(
  ts.city.train %>% as.tibble() %>% 
  time_decompose(total.adult.bookings, method = 'stl', merge = TRUE) %>%
  anomalize(remainder, alpha = 0.15, method = 'gesd') %>% #increasing sensitivity to outliers
  time_recompose() %>% 
    plot_anomalies(
      ncol = 2,
      alpha_dots = 0.5,
      alpha_circles = 0.5,
      size_circles = 2,
      time_recomposed = TRUE,
      alpha_ribbon = 0.05
      ) + scale_y_continuous(labels = comma) +
    labs(x = '', y = 'total.bookings', title = 'city hotel total.adult.bookings')
  )
#----------------------------

(ts.resort.train %>% as.tibble() %>% 
  time_decompose(total.adult.bookings, method = 'stl', merge = TRUE) %>%
  anomalize(remainder, alpha = 0.15, method = 'gesd') %>% #increasing sensitivity to outliers
  time_recompose())
ggplotly(
  ts.resort.train %>% as.tibble() %>% 
  time_decompose(total.adult.bookings, method = 'stl', merge = TRUE) %>%
  anomalize(remainder, alpha = 0.15, method = 'gesd') %>% #increasing sensitivity to outliers
  time_recompose() %>% 
    plot_anomalies(
      ncol = 2,
      alpha_dots = 0.5,
      alpha_circles = 0.5,
      size_circles = 2,
      time_recomposed = TRUE,
      alpha_ribbon = 0.05
      ) + scale_y_continuous(labels = comma) +
    labs(x = '', y = 'total.bookings', title = 'resort hotel total adults')
  )

Predicting Next 4 Wks Total Adults by Hotel Type

City Hotel

library(prophet)
## Loading required package: Rcpp
## 
## Attaching package: 'Rcpp'
## The following object is masked from 'package:rsample':
## 
##     populate
## Loading required package: rlang
## 
## Attaching package: 'rlang'
## The following objects are masked from 'package:purrr':
## 
##     %@%, as_function, flatten, flatten_chr, flatten_dbl, flatten_int,
##     flatten_lgl, flatten_raw, invoke, list_along, modify, prepend,
##     splice
## The following object is masked from 'package:data.table':
## 
##     :=
# https://dygraphs.com/options.html

#renaming cols to prophet's col conventions
prophet.city.df = ts.city.train %>% select(ds = arrival.week, y = total.adult.bookings)

#creating model
prophet.city.mdl = prophet.city.df %>% prophet(yearly.seasonality = TRUE)
## Disabling weekly seasonality. Run prophet with weekly.seasonality=TRUE to override this.
## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
#using model make future period df
prophet.city.future.df = prophet.city.mdl %>% make_future_dataframe(
  periods = 4, #next 4 wks
  freq = 'week',
  include_history = TRUE
  )

#make forecasts df
prophet.city.forecast.df = prophet.city.mdl %>% predict(prophet.city.future.df)

prophet.city.forecast.df %>% head %>% DT::datatable()
#plot forecast
prophet.city.mdl %>% dyplot.prophet(
  prophet.city.forecast.df,
  main = '<h style="color: black; font-size:18px;">City Hotel: Total Adults 4 Week Prediction</h>'
  ) %>%
  dygraphs::dyOptions(
    colors = c('darkgreen','blue'),
    pointSize = 2,
    )
## Warning: `select_()` is deprecated as of dplyr 0.7.0.
## Please use `select()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
#plot forecast components
prophet.city.mdl %>% prophet_plot_components(prophet.city.forecast.df)

measure model performance - city

(prophet.city.preds.truth = tibble(
  arrival.date = ts.city.test %>% pull(arrival.week) %>% head(4),
  truth =  ts.city.test %>% pull(total.adult.bookings) %>% head(4),
  estimate = prophet.city.forecast.df %>% pull(yhat) %>% tail(4)
))
bind_rows(
prophet.city.preds.truth %>% yardstick::rmse(truth = truth, estimate = estimate),
prophet.city.preds.truth %>% yardstick::mae(truth = truth, estimate = estimate)
)

On average, our weekly predictions are off by 89 people

Resort Hotel

library(prophet)
# https://dygraphs.com/options.html

#renaming cols to prophet's col conventions
prophet.resort.df = ts.resort.train %>% select(ds = arrival.week, y = total.adult.bookings)

#creating model
prophet.resort.mdl = prophet.resort.df %>% prophet(yearly.seasonality = TRUE)
## Disabling weekly seasonality. Run prophet with weekly.seasonality=TRUE to override this.
## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
#using model make future period df
prophet.resort.future.df = prophet.resort.mdl %>% make_future_dataframe(
  periods = 4, #next 4 wks
  freq = 'week',
  include_history = TRUE
  )

#make forecasts df
prophet.resort.forecast.df = prophet.resort.mdl %>% predict(prophet.resort.future.df)

prophet.resort.forecast.df %>% head %>% DT::datatable()
#plot forecast
prophet.resort.mdl %>% dyplot.prophet(
  prophet.resort.forecast.df,
  main = '<h style="color: black; font-size:18px;">Resort Hotel: Total Adults 4 Week Prediction</h>'
  ) %>%
  dygraphs::dyOptions(
    colors = c('darkorange','blue'),
    pointSize = 2,
    )
#plot forecast components
prophet.resort.mdl %>% prophet_plot_components(prophet.resort.forecast.df)

(prophet.resort.preds.truth = tibble(
  arrival.date = ts.resort.test %>% pull(arrival.week) %>% head(4),
  truth =  ts.resort.test %>% pull(total.adult.bookings) %>% head(4),
  estimate = prophet.resort.forecast.df %>% pull(yhat) %>% tail(4)
))
bind_rows(
prophet.resort.preds.truth %>% yardstick::rmse(truth = truth, estimate = estimate),
prophet.resort.preds.truth %>% yardstick::mae(truth = truth, estimate = estimate)
)

On average, our weekly predictions are off by 78 people

Comments

  1. Maybe it’s easier to predict number of bookings as opposed to number of adults (one booking can have one or more adults)?